Environment
I wrote the environment class against the OpenAI Gym interface (subclassing gym.Env) and then created an instance of it.
import numpy as np
import gym
from gym import spaces
from kaggle_environments import make

# get_heuristic and alphabeta_agent are defined elsewhere (see the notes below the class)

class ConnectFourGym(gym.Env):
    def __init__(self, agent_policy="random"):
        ks_env = make("connectx", debug=True)
        self.ks_env = ks_env
        # train() pairs our learning agent (the None slot) against the given opponent policy
        self.env = ks_env.train([None, agent_policy])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        self.action_space = spaces.Discrete(self.columns)
        self.observation_space = spaces.Box(low=0, high=2,
                                            shape=(1, self.rows, self.columns), dtype=int)
        self.reward_range = (-10, 3)
        self.spec = None
        self.metadata = None

    def reset(self):
        self.obs = self.env.reset()
        return np.array(self.obs['board']).reshape(1, self.rows, self.columns)

    def normalize(self, x):
        # map the heuristic's assumed range of [-10000, 50000] onto [-1, 1]
        return 2 * (x + 10000) / 60000 - 1

    def change_reward(self, old_reward, done):
        if old_reward == 1:
            # win
            return 3
        elif done:
            # any other terminal state (loss or draw)
            return -3
        else:
            # if the move does not immediately end the game, reward is the heuristic
            # score of the resulting position, scaled to [-1, 1]
            grid = np.asarray(self.obs['board']).reshape(self.rows, self.columns)
            return self.normalize(get_heuristic(grid, self.obs.mark, self.ks_env.configuration))

    def step(self, action):
        if self.obs['board'][int(action)] == 0:
            self.obs, old_reward, done, _ = self.env.step(int(action))
            reward = self.change_reward(old_reward, done)
        else:
            # invalid move (column already full): penalize heavily and end the episode
            reward, done, _ = -10, True, {}
        return np.array(self.obs['board']).reshape(1, self.rows, self.columns), reward, done, _
env = ConnectFourGym(agent_policy=alphabeta_agent)
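ks_env.train([None, agent_policy]) accepts any ConnectX agent, i.e. a callable that takes an observation and a configuration and returns a column index. As a rough illustration of that interface (the real opponent here, alphabeta_agent, is described next), a random-valid-move opponent could look like this:

import random

def random_valid_agent(obs, config):
    # pick uniformly among columns whose top cell is still empty
    valid_moves = [c for c in range(config.columns) if obs.board[c] == 0]
    return random.choice(valid_moves)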
alphabeta_agent is the analytical agent built in another notebook and explained on this page, and get_heuristic() is the function that scores a position using the heuristic described here.
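From the call in change_reward(), get_heuristic(grid, mark, config) receives the board as a rows x columns array, the player's mark, and the Kaggle configuration. The actual scoring and weights live in the linked page (and they are what the [-10000, 50000] range in normalize() reflects); purely as an illustration of the window-counting idea, with made-up weights, a sketch might look like:

import numpy as np

def get_heuristic_sketch(grid, mark, config):
    # Illustrative only: the real get_heuristic and its weights are defined in the linked notebook.
    inarow = config.inarow
    rows, cols = grid.shape
    opp = 1 if mark == 2 else 2
    # collect every horizontal, vertical and diagonal window of length inarow
    windows = []
    for r in range(rows):
        for c in range(cols - inarow + 1):
            windows.append(grid[r, c:c + inarow])                                  # horizontal
    for r in range(rows - inarow + 1):
        for c in range(cols):
            windows.append(grid[r:r + inarow, c])                                  # vertical
    for r in range(rows - inarow + 1):
        for c in range(cols - inarow + 1):
            windows.append(np.array([grid[r + i, c + i] for i in range(inarow)]))              # diagonal down-right
            windows.append(np.array([grid[r + inarow - 1 - i, c + i] for i in range(inarow)])) # diagonal up-right
    score = 0
    for w in windows:
        mine, theirs, empty = np.sum(w == mark), np.sum(w == opp), np.sum(w == 0)
        if mine == inarow:
            score += 10000   # made-up weight for a completed four
        elif mine == inarow - 1 and empty == 1:
            score += 10      # made-up weight for an open three
        elif theirs == inarow - 1 and empty == 1:
            score -= 100     # made-up penalty for an opponent's open three
    return score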
Essentially, the environment uses alphabeta_agent as the opponent to train against, and the heuristic drives the reward function. The per-step reward is the heuristic score normalized to between -1 and 1 (normalize() maps the heuristic's assumed range of -10000 to 50000 onto that interval), except when the game terminates:
- Win gets 3 points
- Loss gets -3 points (a draw hits the same branch and is also scored -3)
- Invalid move is penalized heavily with -10 points
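As a quick sanity check of this shaping, one can roll out a few episodes with a random (but valid) policy and confirm that per-step rewards stay in [-1, 1] while terminal rewards are ±3 (or -10 on an invalid move). A minimal loop, here using the built-in "random" opponent rather than alphabeta_agent just for the smoke test, might look like:

import random

env = ConnectFourGym(agent_policy="random")
for episode in range(3):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        # choose a random column whose top cell is still empty
        valid = [c for c in range(env.columns) if obs[0, 0, c] == 0]
        obs, reward, done, _ = env.step(random.choice(valid))
        total_reward += reward
    print(f"episode {episode}: total shaped reward = {total_reward:.2f}")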